/**
 * 
 */
package org.dse.proc;

/**
 * This interface provides methods to access and process the raw files. The
 * class which implements it will contain an algorithm to process the raw
 * content.
 * 
 * @author zhangchen
 * 
 */
public interface RawHtmlDocumentManager {

	/**
	 * According to the task file, read the data in the raw file, and create
	 * index.
	 * 
	 * @param taskFile
	 *            the file containing the raw files which will be processed and
	 *            the urls which will be deleted
	 */
	public void indexFiles(String taskFile);

	/**
	 * Get the url string of a html document according to the startpos of the
	 * html document in the raw file.
	 * 
	 * @param rawFileName
	 *            the raw file containing the document
	 * @param offset
	 *            the start pos in the raw file
	 * @return a String representing the url
	 */
	public String getUrl(String rawFileName, int offset);

	/**
	 * Get the full content of a html document according to the startpos of the
	 * html document in the raw file.
	 * 
	 * @param rawFileName
	 *            the raw file containing the document
	 * @param offset
	 *            the start pos in the raw file
	 * @return a String storing the content
	 */
	public String getHtmlContent(String rawFileName, int offset);
}
